1. Male and female graduate enrollment (1967 - 2015). Compare trends in total graduate enrollment for males and females (including full-time/part-time and private/public universities) in the United States from 1967 - 2015. Describe your results statistically, graphically and in text.

What if we want to describe the way that one variable changes with respect to another variable? Then we might use regression to:

4 assumptions for linear regression

Linear Regression Steps:

# Read in dataset
enroll_totals <- read_csv("Grad enrollment 1967 - 2015.csv")
## Parsed with column specification:
## cols(
##   year = col_integer(),
##   total = col_integer(),
##   total_males = col_integer(),
##   total_females = col_integer()
## )
#### Test for linearity ####
plot(enroll_totals$year, enroll_totals$total_males)

# male enrollment increases at a linear trend by year. 


plot(enroll_totals$year, enroll_totals$total_females)

# female enrollment increases at a linear trend by year

#### Run regressions ####

# linear regression for males predicted by year

males_lm <- lm(total_males ~ year, data = enroll_totals)
summary(males_lm)
## 
## Call:
## lm(formula = total_males ~ year, data = enroll_totals)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96461 -34861 -12841  51876  95766 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17112153    1087024  -15.74   <2e-16 ***
## year             9069        546   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8514 
## F-statistic:   276 on 1 and 47 DF,  p-value: < 2.2e-16
# total_male_enrollment = -17112153 + 9069*year

# check diagnostic plots
par(mfrow = c(2,2))
plot(males_lm)

# residuals are normally distributed


# linear regression for females predicted by year

females_lm <- lm(total_females ~ year, data = enroll_totals)
summary(females_lm)
## 
## Call:
## lm(formula = total_females ~ year, data = enroll_totals)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89397 -48101  -7633  45267 129727 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.896e+07  1.161e+06  -50.77   <2e-16 ***
## year         3.013e+04  5.832e+02   51.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9823 
## F-statistic:  2669 on 1 and 47 DF,  p-value: < 2.2e-16
females_lm
## 
## Call:
## lm(formula = total_females ~ year, data = enroll_totals)
## 
## Coefficients:
## (Intercept)         year  
##   -58955502        30126
#total_female_enrollment = -5.896e+07 + 3.013e+04*year

# at year 0 for both models, we have negative enrollment, which is impossible. This means that we likely will not have a good model for past models. 

# check diagnostic plots
par(mfrow = c(2,2))
plot(females_lm)

# residuals are normally distributed

# correlation testing for males vs year
cor.test(enroll_totals$year, enroll_totals$total_males)
## 
##  Pearson's product-moment correlation
## 
## data:  enroll_totals$year and enroll_totals$total_males
## t = 16.612, df = 47, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8690777 0.9568547
## sample estimates:
##       cor 
## 0.9243741
# strong positive correlation

# correlation testing for females vs year 
cor.test(enroll_totals$year, enroll_totals$total_females)
## 
##  Pearson's product-moment correlation
## 
## data:  enroll_totals$year and enroll_totals$total_females
## t = 51.659, df = 47, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9845609 0.9951144
## sample estimates:
##       cor 
## 0.9913086
#even stronger positive correlation

# Need to interpret.
m_f_enroll_lmtbl <- stargazer(males_lm, females_lm, type = "html", out = "AIC.htm")
Dependent variable:
total_males total_females
(1) (2)
year 9,069.301*** 30,126.030***
(545.955) (583.175)
Constant -17,112,153.000*** -58,955,502.000***
(1,087,024.000) (1,161,132.000)
Observations 49 49
R2 0.854 0.983
Adjusted R2 0.851 0.982
Residual Std. Error (df = 47) 54,046.810 57,731.420
F Statistic (df = 1; 47) 275.952*** 2,668.611***
Note: p<0.1; p<0.05; p<0.01
# FINAL GRAPH: Make a time series graph with year on x axis and totals on y axis

totals_line <- ggplot(enroll_totals, aes(x = year)) +
  geom_line(aes(y = total_males, color = "blue")) +
  geom_line(aes(y = total_females, color = "red")) + 
  theme_classic() +                                                      
  scale_x_continuous(expand = c(0,0), breaks = seq(1967, 2015, by = 5), limits = c(1967, 2015)) + 
  scale_y_continuous(breaks = seq(0, 2200000, by = 200000)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800"), labels = c("Males", "Females")) +
  labs(x = "Year", y = "Enrollment Totals") +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank())

totals_line

# FINAL GRAPH: plot the regressions with geom_smooth
regression_graphs <- ggplot(enroll_totals, aes(x = year)) +
  geom_point(aes(y = total_males, color = "blue")) + 
  geom_smooth(aes(y = total_males, color = "blue"), method = "lm", se = TRUE) +
  geom_point(aes(y = total_females, color = "red")) +
  geom_smooth(aes(y = total_females, color = "red"), method = "lm", se = TRUE) +
  theme_classic() +
  labs(x = "Year", y = "Enrollment Totals") +
  scale_x_continuous(expand = c(0,0), breaks = seq(1967, 2015, by = 5), limits = c(1967, 2015)) +
  scale_y_continuous(breaks = seq(0, 2200000, by = 200000)) +
  scale_color_manual(values = c("skyblue1", "salmon"), labels = c("Males", "Females")) +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank()) +
  annotate("text", x = 1997, y = 1600000, label = "y = -58,955,502 + 30,126x", family = "Century Schoolbook") +
   annotate("text", x = 1997, y = 1500000, label = expression(R^2~"="~0.98) , family = "Century Schoolbook") + annotate("text", x = 1997, y = 1700000, label = "Females:", family = "Century Schoolbook") +
  annotate("text", x = 2002, y = 775000, label = "y = -17,112,153 + 9,069x", family = "Century Schoolbook") +
   annotate("text", x = 2002, y = 700000, label = expression(R^2~"="~0.85) , family = "Century Schoolbook") + annotate("text", x = 2002, y = 850000, label = "Males:", family = "Century Schoolbook") 
  
  
  
regression_graphs
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

female: y = …

male: y = …

2. Shifts in female PhD recipients by field (1985, 2000, and 2015). Describe if and how there was a shift in PhDs awarded to females in four fields (Physical and Earth Sciences, Engineering, Education, and Humanities & Arts) in 1985, 2000, and 2015. Describe your results statistically, in a graph or table, and in text. Note: There are several ways that you can interpret this question. You are invited to decide which you think is/are most interesting. Just be really clear about what you are asking/answering in your report.

female_phd <- read_csv("PhDs by Field 1985 - 2015.csv")
## Parsed with column specification:
## cols(
##   year = col_integer(),
##   sciences = col_number(),
##   engineering = col_number(),
##   education = col_number(),
##   humanities = col_number()
## )
female_phd1 <- select(female_phd, -year)  # delete year column

rownames(female_phd1) <- c("1985", "2000", "2015")   # name rows as year
## Warning: Setting row names on a tibble is deprecated.
female_phd_prop <- prop.table(as.matrix(female_phd1), 1)  #create proportion table

female_phd_x2 <- chisq.test(female_phd1)   #perform chisquare test

female_phd_x2  ####### ilayda add total enrollment column #######
## 
##  Pearson's Chi-squared test
## 
## data:  female_phd1
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
# females moved out of education, and into the sciences and engineering (engineering especially). The humanities generally stayed the same. 

# Ilayda make table adding in the total enrollment of females column 
#adding stacked barchart to see if we want to use it
stacked_bar_data <- read_csv("female_phd_stackedbar.csv")
## Parsed with column specification:
## cols(
##   year = col_number(),
##   field_of_study = col_character(),
##   degrees_awarded = col_number()
## )
stacked_bar_data$year <- as.character(stacked_bar_data$year)

stacked_bar <- ggplot(stacked_bar_data, aes(x = year, y = degrees_awarded, fill = field_of_study)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(text = element_text(family = "Century Schoolbook")) +
  scale_fill_discrete(name = "Field of Study") +
  labs(x = "Year", y = "Degrees Awarded")  + 
  annotate("text", x = 1, y = 270, label = "10.1%", family = "Century Schoolbook") +
  annotate("text", x = 1, y = 1300, label = "24.6%", family = "Century Schoolbook") +
  annotate("text", x = 1, y = 2075, label = "3.5%", family = "Century Schoolbook") +
  annotate("text", x = 1, y = 3700, label = "61.8%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 500, label = "11.7%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 2400, label = "30.7%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 4100, label = "9.6%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 6500, label = "48.0%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 900, label = "18.7%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 3300, label = "26.7%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 6000, label = "21.7%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 8800, label = "32.9%", family = "Century Schoolbook") 
  
  
stacked_bar

3. Male and female salaries for starting postdoctoral and other employment positions (2015). Compare median salaries for male and female doctorate recipients in 2015. Answer these two questions: Does median salary differ significantly between male and female starting postdoc positions? Does median salary differ significantly between male and female PhD recipients in non-postdoc employment positions?

med_sal <- read_csv("Median salary for doctoral recipients cleaned.csv")
## Parsed with column specification:
## cols(
##   field_of_study = col_character(),
##   p_sal_m = col_number(),
##   p_sal_f = col_number(),
##   np_sal_m = col_number(),
##   np_sal_f = col_number()
## )
######################## Post Doctoral Work ############################
# Test for equal variance
# H0: Variances are equal
# HA: Variances are not equal

var_test_p <- var.test(med_sal$p_sal_m, med_sal$p_sal_f)
var_test_p
## 
##  F test to compare two variances
## 
## data:  med_sal$p_sal_m and med_sal$p_sal_f
## F = 0.90255, num df = 14, denom df = 14, p-value = 0.8506
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3030138 2.6883336
## sample estimates:
## ratio of variances 
##          0.9025532
 # retain the null hypothesis that the variances are equal

# Exploratory graphs
# hist for male median salary postdoctoral


p_male_hist <- ggplot(aes(x = p_sal_m), data = med_sal) +
  geom_histogram(bins = 4.932424)
p_male_hist

# seems relatively normal

p_female_hist <- ggplot(aes(x = p_sal_f), data = med_sal) +
  geom_histogram(bins = 4.932424)
p_female_hist

# seems relatively normal


# H0: There is not a significant difference in ranks for male and female post doc positions
# HA: There is a significant difference in ranks for male and female post doc positions
wsr_salaries_p <- wilcox.test(med_sal$p_sal_m, med_sal$p_sal_f, paired = TRUE) 
## Warning in wilcox.test.default(med_sal$p_sal_m, med_sal$p_sal_f, paired =
## TRUE): cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_sal$p_sal_m, med_sal$p_sal_f, paired =
## TRUE): cannot compute exact p-value with zeroes
wsr_salaries_p
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_sal$p_sal_m and med_sal$p_sal_f
## V = 19.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
# p = 0.8671
# this tells us to retain null. There is no significant diff between male and female post doc positions among field.

# Cliffs delta
p_sal_delta <- cliff.delta(med_sal$p_sal_m, med_sal$p_sal_f)
p_sal_delta
## 
## Cliff's Delta
## 
## delta estimate: 0.04 (negligible)
## 95 percent confidence interval:
##        inf        sup 
## -0.3710946  0.4379849
# effect size is 0.04 (negligible)

# Actual difference in medians
# median(med_sal$p_sal_m) - median(med_sal$p_sal_f)
# difference = 3000

med_sal_p <- med_sal %>%
  select(field_of_study, p_sal_m, p_sal_f) %>%
  mutate(difference = p_sal_m - p_sal_f)

########################## non postdoctoral work #########################################
# Test for equal variance
# H0: Variances are equal
# HA: Variances are not equal

var_test_np <- var.test(med_sal$np_sal_m, med_sal$np_sal_f)
var_test_np
## 
##  F test to compare two variances
## 
## data:  med_sal$np_sal_m and med_sal$np_sal_f
## F = 1.0856, num df = 14, denom df = 14, p-value = 0.88
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3644779 3.2336422
## sample estimates:
## ratio of variances 
##           1.085629
 # retain the null hypothesis that the variances are equal

# Exploratory graphs
# hist for male median salary postdoctoral


np_male_hist <- ggplot(aes(x = np_sal_m), data = med_sal) +
  geom_histogram(bins = 4.932424)
np_male_hist

# seems relatively normal

np_female_hist <- ggplot(aes(x = np_sal_f), data = med_sal) +
  geom_histogram(bins = 4.932424)
np_female_hist

# seems relatively normal


# H0: There is not a significant difference in ranks for male and female non post doc positions
# HA: There is a significant difference in ranks for male and female non post doc positions
wsr_salaries_np <- wilcox.test(med_sal$np_sal_m, med_sal$np_sal_f, paired = TRUE) 
## Warning in wilcox.test.default(med_sal$np_sal_m, med_sal$np_sal_f, paired =
## TRUE): cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_sal$np_sal_m, med_sal$np_sal_f, paired =
## TRUE): cannot compute exact p-value with zeroes
wsr_salaries_np
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_sal$np_sal_m and med_sal$np_sal_f
## V = 101, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
# p = 0.002572
# this tells us to reject null. There is significant diff between male and female non post doc positions among field of study. 

# Cliffs delta
np_sal_delta <- cliff.delta(med_sal$np_sal_m, med_sal$np_sal_f)
np_sal_delta
## 
## Cliff's Delta
## 
## delta estimate: 0.2133333 (small)
## 95 percent confidence interval:
##        inf        sup 
## -0.2155378  0.5732121
# effect size is 0.213333 (SMALL)

# Actual difference in medians
# median(med_sal$np_sal_m) - median(med_sal$np_sal_f)
# difference = 3417

med_sal_np <- med_sal %>%
  select(field_of_study, np_sal_m, np_sal_f) %>%
  mutate(difference = np_sal_m - np_sal_f)
# read in new med_sal data
med_sal_bar <- read_csv("med_sal_bar.csv")
## Parsed with column specification:
## cols(
##   field_of_study = col_character(),
##   salary_np = col_number(),
##   gender = col_character(),
##   salary_p = col_number()
## )
# column graph of differences for non postdoctoral work
np_bar_salaries <- ggplot(med_sal_bar, aes(x = field_of_study, y = salary_np, fill = gender)) +
  geom_col(stat = "identity", position = "dodge") +
  coord_flip() +
  theme_classic() +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  labs(x = "Field of Study", y = "Median Salary ($)") +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank()) +
  scale_fill_manual( values = c("salmon", "skyblue1"),labels = c("Female", "Male"))
## Warning: Ignoring unknown parameters: stat
np_bar_salaries

# column graph of differences for postdoctoral work
p_bar_salaries <- ggplot(med_sal_bar, aes(x = field_of_study, y = salary_p, fill = gender)) +
  geom_col(stat = "identity", position = "dodge") +
  coord_flip() +
  theme_classic() +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  labs(x = "Field of Study", y = "Median Salary ($)") +
  scale_fill_manual(values = c("salmon", "skyblue1"),labels = c("Female", "Male")) +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank())
## Warning: Ignoring unknown parameters: stat
p_bar_salaries

4. Exploring academic salaries for professors in U.S. colleges. Explore relationships between variables in the ‘Faculty salary data (2008 - 2009 survey)’ dataset. Develop a model describing faculty salary based on data for faculty sex, rank, years in current position, field, and number of years since doctoral degree was earned. You should make decisions regarding which variables should remain in your final model. Describe the results qualitatively and quantitatively (i.e., don’t just report the statistical results of the model – make sure you describe interesting findings in text). You can also discuss any concerns that you have with the model(s) you present, if any.

H0: The model does not significantly predict the outcome (i.e., there is no relationship between the model and the outcome; no variables significantly predict outcome) HA: AT LEAST ONE of the explanatory variables significantly predicts the outcome

How do I know which model is the BEST one?

Signs and Symptoms of Collinearity

Variance Inflation Factor

faculty_salary <- read_csv("Faculty salary data (2008 - 2009 survey).csv")
## Parsed with column specification:
## cols(
##   rank = col_character(),
##   field = col_character(),
##   years_since = col_integer(),
##   years_service = col_integer(),
##   sex = col_character(),
##   salary = col_integer()
## )
# look at relationships between variables

plot(faculty_salary$years_since, faculty_salary$salary) # looks ok

plot(faculty_salary$years_service, faculty_salary$salary) # looks ok

plot(faculty_salary$years_since, faculty_salary$years_service) # definitely colinear

# a. Set rank to class 'factor'
faculty_salary$rank <- as.factor(faculty_salary$rank)

# d. Reassign reference level of "rank" to "AsstProf":
faculty_salary$rank <- fct_relevel(faculty_salary$rank, "AsstProf")
levels(faculty_salary$rank)
## [1] "AsstProf"  "AssocProf" "Prof"
# a. Set field to class 'factor'
faculty_salary$field <- as.factor(faculty_salary$field)

# d. Reassign reference level of "Status" to "Regular":
faculty_salary$field <- fct_relevel(faculty_salary$field, "Theoretical")
levels(faculty_salary$field)
## [1] "Theoretical" "Applied"
# saturated model
test_lm1 <- lm(salary ~ rank + field + years_since + years_service + sex, data = faculty_salary)
summary(test_lm1)
## 
## Call:
## lm(formula = salary ~ rank + field + years_since + years_service + 
##     sex, data = faculty_salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    65955.2     4588.6  14.374  < 2e-16 ***
## rankAssocProf  12907.6     4145.3   3.114  0.00198 ** 
## rankProf       45066.0     4237.5  10.635  < 2e-16 ***
## fieldApplied   14417.6     2342.9   6.154 1.88e-09 ***
## years_since      535.1      241.0   2.220  0.02698 *  
## years_service   -489.5      211.9  -2.310  0.02143 *  
## sexMale         4783.5     3858.7   1.240  0.21584    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
plot(test_lm1)

# heteroskedastic

vif(test_lm1)
##                   GVIF Df GVIF^(1/(2*Df))
## rank          2.013193  2        1.191163
## field         1.064105  1        1.031555
## years_since   7.518936  1        2.742068
## years_service 5.923038  1        2.433729
## sex           1.030805  1        1.015285
# years since = 7.5
# years service = 5.9
# just as we expected, most likely colinear. Should be concerned. Lets try taking out years since phd. 


test_lm2 <- lm(salary ~ rank + field + years_service + sex, data = faculty_salary)
summary(test_lm2)
## 
## Call:
## lm(formula = salary ~ rank + field + years_service + sex, data = faculty_salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64202 -14255  -1533  10571  99163 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   68351.67    4482.20  15.250  < 2e-16 ***
## rankAssocProf 14560.40    4098.32   3.553 0.000428 ***
## rankProf      49159.64    3834.49  12.820  < 2e-16 ***
## fieldApplied  13473.38    2315.50   5.819 1.24e-08 ***
## years_service   -88.78     111.64  -0.795 0.426958    
## sexMale        4771.25    3878.00   1.230 0.219311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22650 on 391 degrees of freedom
## Multiple R-squared:  0.4478, Adjusted R-squared:  0.4407 
## F-statistic: 63.41 on 5 and 391 DF,  p-value: < 2.2e-16
plot(test_lm2)

#heteroskedastic


vif(test_lm2)
##                   GVIF Df GVIF^(1/(2*Df))
## rank          1.597441  2        1.124233
## field         1.029040  1        1.014416
## years_service 1.627110  1        1.275582
## sex           1.030803  1        1.015284
# everything is ok here



test_lm3 <- lm(salary ~ rank + field + years_since + sex, data = faculty_salary)
summary(test_lm3)
## 
## Call:
## lm(formula = salary ~ rank + field + years_since + sex, data = faculty_salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67451 -13860  -1549  10716  97023 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   67884.32    4536.89  14.963  < 2e-16 ***
## rankAssocProf 13104.15    4167.31   3.145  0.00179 ** 
## rankProf      46032.55    4240.12  10.856  < 2e-16 ***
## fieldApplied  13937.47    2346.53   5.940 6.32e-09 ***
## years_since      61.01     127.01   0.480  0.63124    
## sexMale        4349.37    3875.39   1.122  0.26242    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22660 on 391 degrees of freedom
## Multiple R-squared:  0.4472, Adjusted R-squared:  0.4401 
## F-statistic: 63.27 on 5 and 391 DF,  p-value: < 2.2e-16
plot(test_lm3)

##### use this model^^^^^ 

### check AIC to determine which is better. Still we should trust our judgement here though

AIC(test_lm1)
## [1] 9093.826
#9093.826

AIC(test_lm2)
## [1] 9096.813
#9096.813

AIC(test_lm3)
## [1] 9097.22
#9097.22


ggplot(aes(x = years_service, y = salary, color = field), data = faculty_salary) +
  geom_point() + 
  geom_smooth(aes(years_service, salary, color = field), method = lm, se = FALSE)

ggplot(aes(x = years_since, y = salary, color = field), data = faculty_salary) +
  geom_point() + 
  geom_smooth(aes(years_since, salary, color = field), method = lm, se = FALSE) +
  theme_classic() +
  labs(x = "Years Since PHD", y = "Salary", color = "Field") +
  theme(text = element_text(family = "Century Schoolbook"))

ggplot(aes(x = years_service, y = salary, color = rank), data = faculty_salary) +
  geom_point() + 
  geom_smooth(aes(years_service, salary, color = rank), method = lm, se = FALSE)

ggplot(aes(x = years_since, y = salary, color = rank), data = faculty_salary) +
  geom_point() + 
  geom_smooth(aes(years_since, salary, color = rank), method = lm, se = FALSE)

##interaction term for field and years_since

test_lm4 <- lm(salary ~ rank + field + years_since + sex + years_since*field, data = faculty_salary)
summary(test_lm4)
## 
## Call:
## lm(formula = salary ~ rank + field + years_since + sex + years_since * 
##     field, data = faculty_salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69074 -14118  -1386  10575  95921 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              69396.694   5111.577  13.576  < 2e-16 ***
## rankAssocProf            12916.953   4180.553   3.090  0.00215 ** 
## rankProf                 45684.696   4277.531  10.680  < 2e-16 ***
## fieldApplied             11286.218   4739.175   2.381  0.01772 *  
## years_since                  8.332    151.148   0.055  0.95607    
## sexMale                   4464.127   3882.387   1.150  0.25091    
## fieldApplied:years_since   117.789    182.886   0.644  0.51991    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22680 on 390 degrees of freedom
## Multiple R-squared:  0.4478, Adjusted R-squared:  0.4393 
## F-statistic: 52.71 on 6 and 390 DF,  p-value: < 2.2e-16
vif(test_lm4)
##                       GVIF Df GVIF^(1/(2*Df))
## rank              2.019919  2        1.192157
## field             4.299846  1        2.073607
## years_since       2.920851  1        1.709050
## sex               1.030530  1        1.015150
## field:years_since 4.555340  1        2.134324
AIC(test_lm4)
## [1] 9098.798
#9098.798

##interaction term for rank and years_since
test_lm5 <- lm(salary~ rank + field + years_since + sex + years_since*rank, data = faculty_salary)
summary(test_lm5)
## 
## Call:
## lm(formula = salary ~ rank + field + years_since + sex + years_since * 
##     rank, data = faculty_salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -68714 -13522  -1724  10716  96337 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                68412.9     7392.3   9.255  < 2e-16 ***
## rankAssocProf              17767.8     8254.5   2.152    0.032 *  
## rankProf                   43639.4     7504.1   5.815 1.26e-08 ***
## fieldApplied               13934.1     2350.2   5.929 6.74e-09 ***
## years_since                  -11.1     1102.1  -0.010    0.992    
## sexMale                     4159.9     3886.6   1.070    0.285    
## rankAssocProf:years_since   -253.4     1139.8  -0.222    0.824    
## rankProf:years_since         144.3     1110.0   0.130    0.897    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22680 on 389 degrees of freedom
## Multiple R-squared:  0.4493, Adjusted R-squared:  0.4394 
## F-statistic: 45.34 on 7 and 389 DF,  p-value: < 2.2e-16
##interaction term for rank and years_service
test_lm6 <- lm(salary~ rank + field + years_service + sex + years_service*rank, data = faculty_salary)
summary(test_lm6)
## 
## Call:
## lm(formula = salary ~ rank + field + years_service + sex + years_service * 
##     rank, data = faculty_salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65007 -14267  -1367  10853  98612 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  65940.9     6375.2  10.343  < 2e-16 ***
## rankAssocProf                19680.3     6839.7   2.877  0.00423 ** 
## rankProf                     50761.6     6062.2   8.373 1.02e-15 ***
## fieldApplied                 13471.4     2318.4   5.811 1.30e-08 ***
## years_service                  935.6     1867.1   0.501  0.61660    
## sexMale                       4748.7     3886.3   1.222  0.22248    
## rankAssocProf:years_service  -1249.3     1888.4  -0.662  0.50864    
## rankProf:years_service        -987.9     1871.2  -0.528  0.59783    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22680 on 389 degrees of freedom
## Multiple R-squared:  0.4492, Adjusted R-squared:  0.4393 
## F-statistic: 45.33 on 7 and 389 DF,  p-value: < 2.2e-16
##interaction term for field and years_service
test_lm7 <- lm(salary~ rank + field + years_service + sex + years_service*field, data = faculty_salary)
summary(test_lm7)
## 
## Call:
## lm(formula = salary ~ rank + field + years_service + sex + years_service * 
##     field, data = faculty_salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70632 -14191  -2098   9937  94331 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 72438.8     4805.6  15.074  < 2e-16 ***
## rankAssocProf               13899.7     4086.8   3.401 0.000741 ***
## rankProf                    48021.5     3846.7  12.484  < 2e-16 ***
## fieldApplied                 6255.7     3916.2   1.597 0.110989    
## years_service                -266.8      135.8  -1.965 0.050128 .  
## sexMale                      5196.1     3861.9   1.345 0.179254    
## fieldApplied:years_service    406.3      178.3   2.279 0.023221 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22530 on 390 degrees of freedom
## Multiple R-squared:  0.455,  Adjusted R-squared:  0.4467 
## F-statistic: 54.27 on 6 and 390 DF,  p-value: < 2.2e-16
stargazer(test_lm3, type = "html")
Dependent variable:
salary
rankAssocProf 13,104.150***
(4,167.315)
rankProf 46,032.550***
(4,240.120)
fieldApplied 13,937.470***
(2,346.534)
years_since 61.011
(127.010)
sexMale 4,349.366
(3,875.393)
Constant 67,884.320***
(4,536.892)
Observations 397
R2 0.447
Adjusted R2 0.440
Residual Std. Error 22,663.240 (df = 391)
F Statistic 63.266*** (df = 5; 391)
Note: p<0.1; p<0.05; p<0.01
######################## THIS IS NOT RIGHT ################################

#predict values and graph

df_new <- data.frame(rank = rep(c("Prof",
                                  "AsstProf",
                                  "AssocProf"),
                                times = 12,
                                each = 20), 
                     field = rep(c("Applied",
                                    "Theoretical"),
                                  each = 360), 
                     years_since = rep(seq(from = 1, 
                                           to = 60, 
                                           length = 20),
                                       times = 12), 
                     sex = rep(c("Male", 
                                 "Female"), 
                               each = 360),
                     salary = rep(seq(from = 55000,
                                    to = 235000, 
                                    length = 20), 
                                times = 12)) 

# a. Make predictions using the new data frame:
salary_predict <- as.data.frame(predict(test_lm4, newdata = df_new, se.fit = TRUE, interval = "confidence"))

# b. Bind predictions to the data to make it actually useful:
predict_df <- data.frame(df_new, salary_predict)
#line graph of predictions 
predict_graph <- ggplot(predict_df, aes(x = years_since, y = fit.fit)) +
  geom_line(aes(color = rank)) +
  facet_wrap(~field)
predict_graph